Máster en Data Science UAH

Tasador de viviendas de alquiler vacacional en Madrid

Notebook #2 - Selección de modelo base

Alumno: Héctor Mateos Oblanca
Tutor: Daniel Rodríguez Pérez

Intro

In [1]:
city = 'madrid'
month = '201909'
filename_in = 'src/data/' + city + '-' + month + '-listings-CLEAN.csv'

# modelos para la comparativa
enabled_models = [
    'LINEAL', 
    'RIDGE', 
    'LASSO', 
    'SGD', 
    'R.FOREST', 
    'ADABOOST', 
    'XGBOOST',
    'CATBOOST'
]
In [2]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# from statsmodels.stats.outliers_influence import variance_inflation_factor
 
import catboost as cb
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import shap

%run src/utils.py
In [3]:
coefs = {}
metrics = {}

def collect_results(columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True):
    # coefs
    if skip_coef != True:
        method_coefs = {}
        if hasattr(model, '__intercept'):
            method_coefs['__intercept'] = model.intercept_
        
        for i in range(len(columns.values)):
            method_coefs[columns.values[i]] = abs(model.coef_[i])
        coefs[method] = method_coefs
        df_coefs = pd.DataFrame(coefs)
        df_coefs = df_coefs.sort_values(by=method, ascending=False)
        display(df_coefs)
    
    # metrics
    metrics[method] = {
        'R2 MEAN CV':r2_mean_cv.round(3),
        'R2 STD CV':r2_std_cv.round(3),
        'R2':r2.round(3),
        'MAE':mae.round(3),
        'MSE':mse.round(3)
    }
    
    display(pd.DataFrame(metrics))

def print_feature_importances(method, importances, df):
    feature_score = pd.DataFrame(list(zip(df.dtypes.index, importances)), columns=['Feature','Score'])
    feature_score = feature_score.sort_values(by='Score', 
                                              ascending=True, 
                                              inplace=False, 
                                              kind='quicksort', 
                                              na_position='last')
    
    fig = go.Figure(
        go.Bar(
            x=feature_score['Score'],
            y=feature_score['Feature'],
            orientation='h'
        )
    )
    
    fig.update_layout(
        title=method + " Feature Importance Ranking",
        height=25*len(feature_score)
    )
    
    fig.show()

Carga del dataset

In [4]:
df = pd.read_csv(filename_in)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13363 entries, 0 to 13362
Data columns (total 61 columns):
host_response_time                      13363 non-null object
latitude                                13363 non-null float64
longitude                               13363 non-null float64
property_type                           13363 non-null object
room_type                               13363 non-null object
accommodates                            13363 non-null int64
bathrooms                               13363 non-null float64
bedrooms                                13363 non-null float64
price                                   13363 non-null float64
security_deposit                        13363 non-null float64
cleaning_fee                            13363 non-null float64
guests_included                         13363 non-null int64
extra_people                            13363 non-null float64
minimum_nights_avg_ntm                  13363 non-null float64
maximum_nights_avg_ntm                  13363 non-null float64
number_of_reviews                       13363 non-null int64
number_of_reviews_ltm                   13363 non-null int64
first_review                            13363 non-null object
last_review                             13363 non-null object
review_scores_rating                    13324 non-null float64
review_scores_accuracy                  13325 non-null float64
review_scores_cleanliness               13326 non-null float64
review_scores_checkin                   13326 non-null float64
review_scores_communication             13326 non-null float64
review_scores_location                  13326 non-null float64
review_scores_value                     13326 non-null float64
instant_bookable                        13363 non-null int64
cancellation_policy                     13363 non-null object
reviews_per_month                       13363 non-null float64
district                                13363 non-null object
neighbourhood                           13363 non-null object
has_wifi                                13363 non-null int64
has_essentials                          13363 non-null int64
has_kitchen                             13363 non-null int64
has_heating                             13363 non-null int64
has_washer                              13363 non-null int64
has_hangers                             13363 non-null int64
has_tv                                  13363 non-null int64
has_hair_dryer                          13363 non-null int64
has_iron                                13363 non-null int64
has_shampoo                             13363 non-null int64
has_laptop_friendly_workspace           13363 non-null int64
has_air_conditioning                    13363 non-null int64
has_hot_water                           13363 non-null int64
has_elevator                            13363 non-null int64
has_refrigerator                        13363 non-null int64
has_dishes_and_silverware               13363 non-null int64
has_microwave                           13363 non-null int64
has_bed_linens                          13363 non-null int64
has_no_stairs_or_steps_to_enter         13363 non-null int64
has_coffee_maker                        13363 non-null int64
has_cooking_basics                      13363 non-null int64
has_family/kid_friendly                 13363 non-null int64
has_long_term_stays_allowed             13363 non-null int64
has_first_aid_kit                       13363 non-null int64
has_oven                                13363 non-null int64
has_stove                               13363 non-null int64
has_license                             13363 non-null int64
activity_months                         13363 non-null float64
income_med_occupation                   13363 non-null float64
price_med_occupation_per_accommodate    13363 non-null float64
dtypes: float64(21), int64(32), object(8)
memory usage: 6.2+ MB

Descarte de características

In [5]:
useful_cols = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'cancellation_policy',
    'cleaning_fee',
    'extra_people',
    'guests_included',
    'has_air_conditioning',
    'has_bed_linens',
    'has_coffee_maker',
    'has_cooking_basics',
    'has_dishes_and_silverware',
    'has_elevator',
    'has_essentials',
    'has_family/kid_friendly',
    'has_first_aid_kit',
    'has_hair_dryer',
    'has_hangers',
    'has_heating',
    'has_hot_water',
    'has_iron',
    'has_kitchen',
    'has_laptop_friendly_workspace',
    'has_license',
    'has_long_term_stays_allowed',
    'has_microwave',
    'has_no_stairs_or_steps_to_enter',
    'has_oven',
    'has_refrigerator',
    'has_shampoo',
    'has_stove',
    'has_tv',
    'has_washer',
    'has_wifi',
    'instant_bookable',
    'latitude',
    'longitude',
    'maximum_nights_avg_ntm',
    'minimum_nights_avg_ntm',
    'neighbourhood',
    'price',
    'property_type',
    'room_type',
    'security_deposit'
]

useless_cols = [
    'district',
    'neighbourhood',
    'income_med_occupation',
    'price_med_occupation_per_accommodate',
    'activity_months',
    'host_response_time',
    'first_review',
    'last_review',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month'
]

highly_corr_cols = [
    'host_verified_by_selfie'
]

df.drop([*useless_cols, *highly_corr_cols], axis=1, errors='ignore', inplace=True)
df.shape
Out[5]:
(13363, 43)

Conversión de características categóricas en dummies

In [6]:
print(df.shape)
dfd = pd.get_dummies(df)
print(dfd.shape)

target = 'price'
features = list(dfd.columns)
features.remove(target)
(13363, 43)
(13363, 54)

Multicolinealidad

In [7]:
"""
vif = pd.DataFrame()
vif['vif'] = [variance_inflation_factor(dfd[features].values, i) for i in range(dfd[features].shape[1])]
vif['feature'] = dfd[features].columns
vif_results = vif.round(1).sort_values(by='vif', ascending=False)
"""
Out[7]:
"\nvif = pd.DataFrame()\nvif['vif'] = [variance_inflation_factor(dfd[features].values, i) for i in range(dfd[features].shape[1])]\nvif['feature'] = dfd[features].columns\nvif_results = vif.round(1).sort_values(by='vif', ascending=False)\n"
In [8]:
"""
collineal_features = vif_results[vif_results['vif'] > 20]['feature'].values
print('Collineal features: ', sorted(collineal_features))
df.drop(collineal_features, axis=1, inplace=True)
"""
Out[8]:
"\ncollineal_features = vif_results[vif_results['vif'] > 20]['feature'].values\nprint('Collineal features: ', sorted(collineal_features))\ndf.drop(collineal_features, axis=1, inplace=True)\n"
In [9]:
"""
features = list(dfd.columns)
features.remove(target)
dfd.shape
"""
Out[9]:
'\nfeatures = list(dfd.columns)\nfeatures.remove(target)\ndfd.shape\n'

Partición en conjuntos de entrenamiento y test

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    dfd[features], 
    dfd[target],
    test_size=0.3,
    random_state=42
)

x_train = x_train.astype(float) # prevent conversion warnings

alphas = np.array([100000, 10000, 1000, 100, 1, 0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001])

Comparativa de modelos de regresión

Regresión Lineal

In [11]:
method = 'LINEAL'
if method in enabled_models:
    model = LinearRegression(normalize=True)
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    param_grid = {}

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    
    y_pred = regressor.predict(x_test)
    
    r2 = r2_score(y_test.to_list(), y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.560912514360403
best estimator LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Resultados

In [12]:
if method in enabled_models:
    collect_results(dfd[features].columns, regressor, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
LINEAL
MAE 4.382140e+09
MSE 3.849272e+22
R2 -1.757365e+19
R2 MEAN CV 5.610000e-01
R2 STD CV 2.600000e-02

Regresión Lineal Ridge

In [13]:
method = 'RIDGE'
if method in enabled_models:
    model = Ridge(normalize=True, random_state=42)
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    param_grid = {
        'model__alpha':[0.1], 
        'model__tol':[0.001]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])
    
    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.5610274297981934
best estimator Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=42, solver='auto', tol=0.001)

Resultados

In [14]:
if method in enabled_models:
    collect_results(dfd[features].columns, regressor.best_estimator_, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
LINEAL RIDGE
R2 MEAN CV 5.610000e-01 0.561
R2 STD CV 2.600000e-02 0.025
R2 -1.757365e+19 -10.959
MAE 4.382140e+09 21.835
MSE 3.849272e+22 26194.607

Regresión Lineal Lasso

In [15]:
method = 'LASSO'
if method in enabled_models:
    model = Lasso(normalize=True, random_state=42)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    param_grid = {
        'model__alpha':[0.001, 0.1], 
        'model__tol':[0.001, 0.01],
        'model__max_iter':[100, 200]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])

    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)
    
    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.5611723580587145
best estimator Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=100,
      normalize=True, positive=False, precompute=False, random_state=42,
      selection='cyclic', tol=0.001, warm_start=False)

Resultados

In [16]:
if method in enabled_models:
    collect_results(dfd[features].columns, regressor.best_estimator_, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
LINEAL RIDGE LASSO
R2 MEAN CV 5.610000e-01 0.561 0.561
R2 STD CV 2.600000e-02 0.025 0.026
R2 -1.757365e+19 -10.959 -3.371
MAE 4.382140e+09 21.835 20.765
MSE 3.849272e+22 26194.607 9573.931

Regresión Lineal Stochastic Gradient Descent

In [17]:
method = 'SGD'
if method in enabled_models:
    model = SGDRegressor(random_state=42)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    param_grid = {
        'model__alpha':alphas,
        'model__penalty':['l1', 'l2'],
        'model__tol':[0.3],
        'model__max_iter':[500]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])

    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.5565634471784409
best estimator SGDRegressor(alpha=0.1, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=500,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=42,
             shuffle=True, tol=0.3, validation_fraction=0.1, verbose=0,
             warm_start=False)
The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.

Resultados

In [18]:
if method in enabled_models:
    collect_results(dfd[features].columns, regressor.best_estimator_.named_steps['model'], method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
LINEAL RIDGE LASSO SGD
R2 MEAN CV 5.610000e-01 0.561 0.561 0.555
R2 STD CV 2.600000e-02 0.025 0.026 0.024
R2 -1.757365e+19 -10.959 -3.371 -7.004
MAE 4.382140e+09 21.835 20.765 21.697
MSE 3.849272e+22 26194.607 9573.931 17532.570

Regresión Random Forest

In [19]:
method = 'R.FOREST'
if method in enabled_models:
    model = RandomForestRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])

    param_grid = {
        #'model__max_depth':[15],
        'model__n_estimators':[200]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])

    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.699771566801889
best estimator RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

Resultados

In [20]:
if method in enabled_models:
    collect_results(dfd[features].columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
LINEAL RIDGE LASSO SGD R.FOREST
R2 MEAN CV 5.610000e-01 0.561 0.561 0.555 0.700
R2 STD CV 2.600000e-02 0.025 0.026 0.024 0.024
R2 -1.757365e+19 -10.959 -3.371 -7.004 0.722
MAE 4.382140e+09 21.835 20.765 21.697 14.889
MSE 3.849272e+22 26194.607 9573.931 17532.570 608.245

Predicciones de ejemplo

In [53]:
example = x_train.iloc[0].copy()
smodel = regressor.best_estimator_.named_steps['model']
In [54]:
# ejemplo 1 piso mediano en Barrio del Pilar con las siguientes características:

example['latitude'] = 40.4762958
example['longitude'] = -3.7045541
example['accommodates'] = 4
example['bathrooms'] = 1
example['bedrooms'] = 2
example['security_deposit'] = 35
example['cleaning_fee'] = 50
example['guests_included'] = 2
example['extra_people'] = 15
example['minimum_nights_avg_ntm'] = 1
example['maximum_nights_avg_ntm'] = 60
example['instant_bookable'] = 0
example['has_wifi'] = 1
example['has_essentials'] = 1
example['has_kitchen'] = 1
example['has_heating'] = 1
example['has_washer'] = 1
example['has_hangers'] = 1
example['has_tv'] = 1
example['has_hair_dryer'] = 1
example['has_iron'] = 1
example['has_shampoo'] = 1
example['has_laptop_friendly_workspace'] = 1
example['has_air_conditioning'] = 1
example['has_hot_water'] = 1
example['has_elevator'] = 1
example['has_refrigerator'] = 1
example['has_dishes_and_silverware'] = 1
example['has_microwave'] = 1
example['has_bed_linens'] = 1
example['has_no_stairs_or_steps_to_enter'] = 1
example['has_coffee_maker'] = 1
example['has_cooking_basics'] = 1
example['has_family/kid_friendly'] = 1
example['has_long_term_stays_allowed'] = 0
example['has_first_aid_kit'] = 0
example['has_oven'] = 0
example['has_stove'] = 0
example['has_license'] = 1
example['property_type_Apartment'] = 1
example['property_type_Chalet'] = 0
example['property_type_Condominium'] = 0
example['property_type_House'] = 0
example['property_type_Loft'] = 0
example['room_type_Entire home/apt'] = 1
example['room_type_Private room'] = 0
example['room_type_Shared room'] = 0
example['cancellation_policy_flexible'] = 0
example['cancellation_policy_moderate'] = 0
example['cancellation_policy_strict'] = 0
example['cancellation_policy_strict_14_with_grace_period'] = 1
example['cancellation_policy_super_strict_30'] = 0
example['cancellation_policy_super_strict_60'] = 0

example_prediction = smodel.predict([example])
print(example_prediction)
[93.6135]
In [55]:
# ejemplo 2 piso pequeño en La Latina con las siguientes características:

example['latitude'] = 40.4120087
example['longitude'] = -3.7092935
example['accommodates'] = 2
example['bathrooms'] = 1
example['bedrooms'] = 1
example['security_deposit'] = 50
example['cleaning_fee'] = 50
example['guests_included'] = 2
example['extra_people'] = 0
example['minimum_nights_avg_ntm'] = 1
example['maximum_nights_avg_ntm'] = 15
example['instant_bookable'] = 1
example['has_wifi'] = 1
example['has_essentials'] = 1
example['has_kitchen'] = 1
example['has_heating'] = 1
example['has_washer'] = 1
example['has_hangers'] = 1
example['has_tv'] = 0
example['has_hair_dryer'] = 1
example['has_iron'] = 1
example['has_shampoo'] = 1
example['has_laptop_friendly_workspace'] = 1
example['has_air_conditioning'] = 1
example['has_hot_water'] = 1
example['has_elevator'] = 0
example['has_refrigerator'] = 1
example['has_dishes_and_silverware'] = 1
example['has_microwave'] = 1
example['has_bed_linens'] = 1
example['has_no_stairs_or_steps_to_enter'] = 1
example['has_coffee_maker'] = 1
example['has_cooking_basics'] = 1
example['has_family/kid_friendly'] = 1
example['has_long_term_stays_allowed'] = 0
example['has_first_aid_kit'] = 0
example['has_oven'] = 0
example['has_stove'] = 0
example['has_license'] = 0
example['property_type_Apartment'] = 1
example['property_type_Chalet'] = 0
example['property_type_Condominium'] = 0
example['property_type_House'] = 0
example['property_type_Loft'] = 0
example['room_type_Entire home/apt'] = 1
example['room_type_Private room'] = 0
example['room_type_Shared room'] = 0
example['cancellation_policy_flexible'] = 0
example['cancellation_policy_moderate'] = 0
example['cancellation_policy_strict'] = 0
example['cancellation_policy_strict_14_with_grace_period'] = 0
example['cancellation_policy_super_strict_30'] = 1
example['cancellation_policy_super_strict_60'] = 0

example_prediction = smodel.predict([example])
print(example_prediction)
[81.135]

Interpretación de los resultados con SHAP

In [21]:
smodel = regressor.best_estimator_.named_steps['model']
explainer = shap.TreeExplainer(smodel)
shap.initjs()
shap_values = explainer.shap_values(dfd[features])
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
In [22]:
# summarize the effects of all the features
shap.summary_plot(shap_values, dfd[features], plot_type="bar")
In [23]:
# summarize the effects of all the features
shap.summary_plot(shap_values, dfd[features])
In [24]:
shap.dependence_plot('guests_included', shap_values, dfd[features], interaction_index=None)
In [25]:
shap.dependence_plot('extra_people', shap_values, dfd[features], interaction_index=None)
In [26]:
shap.dependence_plot('accommodates', shap_values, dfd[features], interaction_index=None)
In [27]:
shap.dependence_plot('cleaning_fee', shap_values, dfd[features], interaction_index=None)
In [28]:
shap.dependence_plot('longitude', shap_values, dfd[features], interaction_index=None)
In [29]:
shap.dependence_plot('longitude', shap_values, dfd[features], interaction_index='latitude')
In [30]:
shap.dependence_plot('latitude', shap_values, dfd[features], interaction_index=None)
In [31]:
shap.dependence_plot('latitude', shap_values, dfd[features], interaction_index='longitude')
In [32]:
# visualize the 4000th prediction's explanation
i = 4000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
latitude                                           40.41486
longitude                                          -3.70756
accommodates                                        4.00000
bathrooms                                           2.00000
bedrooms                                            1.00000
price                                              67.50000
security_deposit                                    0.00000
cleaning_fee                                       45.00000
guests_included                                     2.00000
extra_people                                       13.50000
minimum_nights_avg_ntm                              1.10000
maximum_nights_avg_ntm                             31.00000
instant_bookable                                    1.00000
has_wifi                                            1.00000
has_essentials                                      1.00000
has_kitchen                                         1.00000
has_heating                                         1.00000
has_washer                                          1.00000
has_hangers                                         1.00000
has_tv                                              0.00000
has_hair_dryer                                      1.00000
has_iron                                            1.00000
has_shampoo                                         1.00000
has_laptop_friendly_workspace                       0.00000
has_air_conditioning                                1.00000
has_hot_water                                       0.00000
has_elevator                                        0.00000
has_refrigerator                                    0.00000
has_dishes_and_silverware                           0.00000
has_microwave                                       0.00000
has_bed_linens                                      0.00000
has_no_stairs_or_steps_to_enter                     0.00000
has_coffee_maker                                    0.00000
has_cooking_basics                                  0.00000
has_family/kid_friendly                             1.00000
has_long_term_stays_allowed                         0.00000
has_first_aid_kit                                   0.00000
has_oven                                            0.00000
has_stove                                           0.00000
has_license                                         1.00000
property_type_Apartment                             0.00000
property_type_Chalet                                0.00000
property_type_Condominium                           0.00000
property_type_House                                 0.00000
property_type_Loft                                  1.00000
room_type_Entire home/apt                           1.00000
room_type_Private room                              0.00000
room_type_Shared room                               0.00000
cancellation_policy_flexible                        0.00000
cancellation_policy_moderate                        0.00000
cancellation_policy_strict                          0.00000
cancellation_policy_strict_14_with_grace_period     1.00000
cancellation_policy_super_strict_30                 0.00000
cancellation_policy_super_strict_60                 0.00000
Name: 4000, dtype: float64
Out[32]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [33]:
# visualize the 8000th prediction's explanation
i = 8000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
latitude                                             40.40719
longitude                                            -3.69536
accommodates                                          5.00000
bathrooms                                             2.00000
bedrooms                                              3.00000
price                                                90.00000
security_deposit                                    180.00000
cleaning_fee                                         45.00000
guests_included                                       2.00000
extra_people                                         13.50000
minimum_nights_avg_ntm                                3.00000
maximum_nights_avg_ntm                             1111.00000
instant_bookable                                      1.00000
has_wifi                                              1.00000
has_essentials                                        1.00000
has_kitchen                                           1.00000
has_heating                                           1.00000
has_washer                                            1.00000
has_hangers                                           1.00000
has_tv                                                1.00000
has_hair_dryer                                        1.00000
has_iron                                              1.00000
has_shampoo                                           1.00000
has_laptop_friendly_workspace                         1.00000
has_air_conditioning                                  0.00000
has_hot_water                                         1.00000
has_elevator                                          1.00000
has_refrigerator                                      1.00000
has_dishes_and_silverware                             1.00000
has_microwave                                         0.00000
has_bed_linens                                        1.00000
has_no_stairs_or_steps_to_enter                       1.00000
has_coffee_maker                                      1.00000
has_cooking_basics                                    1.00000
has_family/kid_friendly                               1.00000
has_long_term_stays_allowed                           1.00000
has_first_aid_kit                                     0.00000
has_oven                                              1.00000
has_stove                                             0.00000
has_license                                           0.00000
property_type_Apartment                               1.00000
property_type_Chalet                                  0.00000
property_type_Condominium                             0.00000
property_type_House                                   0.00000
property_type_Loft                                    0.00000
room_type_Entire home/apt                             1.00000
room_type_Private room                                0.00000
room_type_Shared room                                 0.00000
cancellation_policy_flexible                          0.00000
cancellation_policy_moderate                          0.00000
cancellation_policy_strict                            0.00000
cancellation_policy_strict_14_with_grace_period       1.00000
cancellation_policy_super_strict_30                   0.00000
cancellation_policy_super_strict_60                   0.00000
Name: 8000, dtype: float64
Out[33]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [34]:
# visualize the 12000th prediction's explanation
i = 12000
print(dfd.iloc[i,:])
shap.force_plot(explainer.expected_value, shap_values[i,:], dfd[features].iloc[i,:])
latitude                                             40.36937
longitude                                            -3.69521
accommodates                                          2.00000
bathrooms                                             1.00000
bedrooms                                              1.00000
price                                                28.80000
security_deposit                                      0.00000
cleaning_fee                                          0.00000
guests_included                                       1.00000
extra_people                                          0.00000
minimum_nights_avg_ntm                                1.00000
maximum_nights_avg_ntm                             1125.00000
instant_bookable                                      0.00000
has_wifi                                              1.00000
has_essentials                                        1.00000
has_kitchen                                           1.00000
has_heating                                           1.00000
has_washer                                            1.00000
has_hangers                                           0.00000
has_tv                                                1.00000
has_hair_dryer                                        1.00000
has_iron                                              1.00000
has_shampoo                                           1.00000
has_laptop_friendly_workspace                         0.00000
has_air_conditioning                                  1.00000
has_hot_water                                         1.00000
has_elevator                                          1.00000
has_refrigerator                                      0.00000
has_dishes_and_silverware                             0.00000
has_microwave                                         0.00000
has_bed_linens                                        0.00000
has_no_stairs_or_steps_to_enter                       0.00000
has_coffee_maker                                      0.00000
has_cooking_basics                                    0.00000
has_family/kid_friendly                               0.00000
has_long_term_stays_allowed                           0.00000
has_first_aid_kit                                     0.00000
has_oven                                              0.00000
has_stove                                             0.00000
has_license                                           0.00000
property_type_Apartment                               1.00000
property_type_Chalet                                  0.00000
property_type_Condominium                             0.00000
property_type_House                                   0.00000
property_type_Loft                                    0.00000
room_type_Entire home/apt                             0.00000
room_type_Private room                                1.00000
room_type_Shared room                                 0.00000
cancellation_policy_flexible                          0.00000
cancellation_policy_moderate                          1.00000
cancellation_policy_strict                            0.00000
cancellation_policy_strict_14_with_grace_period       0.00000
cancellation_policy_super_strict_30                   0.00000
cancellation_policy_super_strict_60                   0.00000
Name: 12000, dtype: float64
Out[34]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

Regresión AdaBoost

In [35]:
method = 'ADABOOST'
if method in enabled_models:
    model = AdaBoostRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])

    param_grid = {
        'model__n_estimators':[175],
        'model__learning_rate':[0.01]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=2)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])

    scores = cross_val_score(regressor, x_train, y_train, cv=2)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.5648146449487732
best estimator AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='linear',
                  n_estimators=175, random_state=42)

Resultados

In [36]:
if method in enabled_models:
    collect_results(dfd[features].columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
LINEAL RIDGE LASSO SGD R.FOREST ADABOOST
R2 MEAN CV 5.610000e-01 0.561 0.561 0.555 0.700 0.565
R2 STD CV 2.600000e-02 0.025 0.026 0.024 0.024 0.015
R2 -1.757365e+19 -10.959 -3.371 -7.004 0.722 0.568
MAE 4.382140e+09 21.835 20.765 21.697 14.889 20.308
MSE 3.849272e+22 26194.607 9573.931 17532.570 608.245 947.290

Regresión XGBoost

In [37]:
method = 'XGBOOST'
if method in enabled_models:
    model = XGBRegressor(random_state=42)
    pipeline = Pipeline([('model', model)])

    param_grid = {
        'model__depth':[10],
        'model__iterations':[150],
        'model__learning_rate':[0.2]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'])

    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
Series.base is deprecated and will be removed in a future version
[08:23:47] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:23:49] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:23:51] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:23:53] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:23:55] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:23:56] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
best score 0.6905992895775445
best estimator XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, depth=10, gamma=0,
             importance_type='gain', iterations=150, learning_rate=0.2,
             max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
             n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)
[08:23:58] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:00] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:01] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:03] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:04] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:06] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:07] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:09] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:10] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:12] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:13] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:14] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:16] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:18] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:19] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:20] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:22] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:23] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:25] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:27] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:28] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:29] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:31] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:32] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:34] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:35] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:37] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:38] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:40] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Series.base is deprecated and will be removed in a future version
[08:24:41] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.

Resultados

In [38]:
if method in enabled_models:
    collect_results(dfd[features].columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
LINEAL RIDGE LASSO SGD R.FOREST ADABOOST XGBOOST
R2 MEAN CV 5.610000e-01 0.561 0.561 0.555 0.700 0.565 0.691
R2 STD CV 2.600000e-02 0.025 0.026 0.024 0.024 0.015 0.027
R2 -1.757365e+19 -10.959 -3.371 -7.004 0.722 0.568 0.705
MAE 4.382140e+09 21.835 20.765 21.697 14.889 20.308 15.739
MSE 3.849272e+22 26194.607 9573.931 17532.570 608.245 947.290 646.123

Regresión CatBoost

In [39]:
method = 'CATBOOST'
if method in enabled_models:
    model = cb.CatBoostRegressor(verbose=0, random_seed=42)
    pipeline = Pipeline([('model', model)])

    param_grid = {
        'model__depth':[10],
        'model__iterations':[150],
        'model__learning_rate':[0.1]
    }

    regressor = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    regressor.fit(x_train, y_train)
    print('best score', regressor.best_score_)
    print('best estimator', regressor.best_estimator_.named_steps['model'].get_params())

    scores = cross_val_score(regressor, x_train, y_train, cv=5)
    r2_mean_cv = np.mean(scores)
    r2_std_cv = np.std(scores)

    y_pred = regressor.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
best score 0.7094824764055004
best estimator {'verbose': 0, 'random_seed': 42, 'loss_function': 'RMSE', 'depth': 10, 'iterations': 150, 'learning_rate': 0.1}

Resultados

In [40]:
if method in enabled_models:
    collect_results(dfd[features].columns, model, method, r2_mean_cv, r2_std_cv, r2, mae, mse, skip_coef=True)
    importances = regressor.best_estimator_.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, dfd[features])
LINEAL RIDGE LASSO SGD R.FOREST ADABOOST XGBOOST CATBOOST
R2 MEAN CV 5.610000e-01 0.561 0.561 0.555 0.700 0.565 0.691 0.709
R2 STD CV 2.600000e-02 0.025 0.026 0.024 0.024 0.015 0.027 0.026
R2 -1.757365e+19 -10.959 -3.371 -7.004 0.722 0.568 0.705 0.707
MAE 4.382140e+09 21.835 20.765 21.697 14.889 20.308 15.739 14.905
MSE 3.849272e+22 26194.607 9573.931 17532.570 608.245 947.290 646.123 642.482

Modelo base seleccionado: Random Forest